package com.otaku.crawler;

import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import com.otaku.crawler.model.USAShopCNPojo;
import com.otaku.crawler.parser.USAShopCNParser;
import com.otaku.crawler.util.Constants;

import edu.uci.ics.crawler4j.crawler.Page;
import edu.uci.ics.crawler4j.crawler.WebCrawler;
import edu.uci.ics.crawler4j.parser.HtmlParseData;
import edu.uci.ics.crawler4j.url.WebURL;

/**
 * Crawler for the product listing pages of usashopcn.com.
 *
 * <p>{@link #shouldVisit(WebURL)} restricts the crawl to listing URLs that are not
 * rejected by {@link Constants#FILTERS}; {@link #visit(Page)} cuts the product-list
 * fragment out of each fetched page, cleans it up, hands it to
 * {@link USAShopCNParser#parse} and prints every parsed product to standard output.
 */
public class USAShopCNCrawler extends WebCrawler {
	/** Lower-case prefix shared by every product listing URL this crawler follows. */
	private static final String LIST_URL_PREFIX = "http://www.usashopcn.com/product/list";

	/** Opening tag of the container that holds the product list. */
	private static final String PRODUCT_LIST_START =
			"<div id=\"product_list\" class=\"gradient_bg product_style_five\">";

	/** Opening tag of the pager; the product-list fragment ends right before it. */
	private static final String PRODUCT_LIST_END = "<div id=\"pager\">";

	/**
	 * Decides whether a discovered URL should be fetched.
	 *
	 * @param url the candidate URL
	 * @return {@code true} when the lower-cased URL is not matched by
	 *         {@link Constants#FILTERS} and starts with the product listing prefix
	 */
	public boolean shouldVisit(WebURL url) {
		// Locale.ROOT keeps the lower-casing independent of the JVM default locale
		// (e.g. Turkish maps 'I' to a dotless 'ı', which would break the prefix check).
		String href = url.getURL().toLowerCase(java.util.Locale.ROOT);
		return !Constants.FILTERS.matcher(href).matches() && href.startsWith(LIST_URL_PREFIX);
	}

	/**
	 * Processes a fetched page: prints its URL and, for HTML pages that contain the
	 * product-list section, parses that section and prints each product's name,
	 * market price, proxy price, URL and image URL.
	 *
	 * <p>Pages without HTML parse data are ignored. Pages in which the product-list
	 * markers cannot be found are skipped with a notice instead of failing with a
	 * {@link StringIndexOutOfBoundsException}.
	 *
	 * @param page the fetched page
	 */
	public void visit(Page page) {
		String url = page.getWebURL().getURL();
		System.out.println("URL: " + url);

		if (!(page.getParseData() instanceof HtmlParseData)) {
			return;
		}
		HtmlParseData htmlParseData = (HtmlParseData) page.getParseData();

		String productListHtml = extractProductListHtml(htmlParseData.getHtml());
		if (productListHtml == null) {
			System.out.println("No product list found, skipping: " + url);
			return;
		}

		List<USAShopCNPojo> products = USAShopCNParser.parse(sanitize(productListHtml));
		for (USAShopCNPojo product : products) {
			System.out.println("\n================" + product.getName()
					+ "\n market:" + product.getMarketPrice()
					+ "\n proxy:" + product.getProxyPrice()
					+ "\n url:" + product.getUrl()
					+ "\n img url:" + product.getImgUrl());
		}
	}

	/**
	 * Returns the part of {@code html} that starts at the product-list container and
	 * ends right before the first pager element that follows it.
	 *
	 * @param html the full page HTML; {@code null} is tolerated
	 * @return the product-list fragment, or {@code null} if {@code html} is
	 *         {@code null} or either marker is missing
	 */
	private static String extractProductListHtml(String html) {
		if (html == null) {
			return null;
		}
		int start = html.indexOf(PRODUCT_LIST_START);
		if (start < 0) {
			return null;
		}
		// Search for the pager only after the list start, matching the original
		// "cut the head off, then look for the pager" behaviour.
		int end = html.indexOf(PRODUCT_LIST_END, start);
		if (end < 0) {
			return null;
		}
		return html.substring(start, end);
	}

	/**
	 * Cleans the product-list fragment before it is parsed: drops {@code <ul>}/{@code <li>}
	 * tags, replaces every {@code &} with {@link Constants#AND_REPLACE} and removes one
	 * known malformed attribute sequence.
	 *
	 * <p>Literal {@link String#replace(CharSequence, CharSequence)} is used on purpose:
	 * none of the search strings are meant as regular expressions, and the replacement
	 * must not be interpreted for {@code $}/{@code \} group references.
	 *
	 * @param fragment the raw product-list HTML
	 * @return the cleaned fragment
	 */
	private static String sanitize(String fragment) {
		return fragment
				.replace("<ul>", "").replace("</ul>", "")
				.replace("<li>", "").replace("</li>", "")
				.replace("&", Constants.AND_REPLACE)
				// NOTE(review): looks like residue of a broken attribute value on the
				// site that would trip the parser; confirm it is still needed.
				.replace("computer=\"\" commuter=\"\" bag\"=\"\"", "");
	}
}
